#' formula version of xgboost
#' @param data Dataframe to create model
#' @param formula Formula for model
#' @param nrounds Maximum number of iteration of training
#' @param weights Weight of data for modeling
#' @param watchlist_rate Ratio of validation data to watch learning process
#' @param na.action How to handle data with na
#' Can be na.omit, na.pass, na.fail
#' @param sparse If matrix should be sparse.
#' As default, it becomes sparse if there is any categorical value.
#' @export
fml_xgboost <- function(data, formula, nrounds= 10, weights = NULL, watchlist_rate = 0, na.action = na.pass, sparse = NULL, ...) {
term <- terms(formula, data = data)
# do.call is used to substitute weights
df_for_model_matrix <- tryCatch({
do.call(model.frame, list(term, data = data, weights = substitute(weights), na.action = na.action))
}, error = function(e){
if(e$message == "missing values in object"){
# this happens when na.action argument is na.fail
stop("There are NAs in the training data. You might want to set 'na.action' parameter to 'na.pass' or impute NAs manually.")
}
stop(e)
})
if(nrow(df_for_model_matrix) == 0){
# this might happen when na.action is na.omit
stop("No valid data to create xgboost model after removing NA.")
} else {
y <- model.response(df_for_model_matrix)
# NA in y causes an error
df_for_model_matrix <- df_for_model_matrix[!is.na(y), ]
y <- y[!is.na(y)]
md_mat <- tryCatch({
if(is.null(sparse)){
sparse <- FALSE
# If any variable is factor, it uses sparse matrix
for(var in all.vars(lazyeval::f_rhs(term))){
if(is.factor(data[[var]])) {
sparse <- TRUE
}
}
}
if(sparse){
tryCatch({
Matrix::sparse.model.matrix(term, data = df_for_model_matrix)
}, error = function(e){
if (e$message == "fnames == names(mf) are not all TRUE"){
# if there are not clean column names like including spaces or special characters,
# Matrix::sparse.model.matrix causes this error
stop("EXP-ANA-3 :: [] :: Invalid column names are found. Please run clean_names function beforehand.")
}
stop(e)
})
} else {
model.matrix(term, data = df_for_model_matrix, contrasts = FALSE)
}
}, error = function(e){
if(e$message == "contrasts can be applied only to factors with 2 or more levels") {
stop("more than 1 unique values are expected for categorical columns assigned as predictors")
}
stop(e)
})
weight <- model.weights(df_for_model_matrix)
ret <- if(watchlist_rate != 0.0) {
if (watchlist_rate < 0 || 1 <= watchlist_rate) {
stop("watchlist_rate must be between 0 and 1")
}
# create validation data set
index <- sample(seq(nrow(md_mat)), ceiling(nrow(md_mat) * watchlist_rate))
watch_mat <- xgboost::xgb.DMatrix(data = safe_slice(md_mat ,index), label = y[index])
train_mat <- xgboost::xgb.DMatrix(data = safe_slice(md_mat ,index, remove = TRUE), label = y[-index])
xgboost::xgb.train(data = train_mat, watchlist = list(train = train_mat, validation = watch_mat), label = y, weight = weight, nrounds = nrounds, ...)
} else {
xgboost::xgboost(data = md_mat, label = y, weight = weight, nrounds = nrounds, ...)
}
ret$terms <- term
# To avoid saving a huge environment when caching with RDS.
attr(ret$terms,".Environment") <- NULL
ret$x_names <- colnames(md_mat)
ret$is_sparse <- sparse
pred_cnames <- all.vars(term)[-1]
# this is how categorical columns are casted to columns in matrix
# this is needed in augment, so that matrix with the same levels
# can be created
ret$xlevels <- .getXlevels(term, df_for_model_matrix)
ret
}
}
#' formula version of xgboost (multinomial)
#' @param output_type Type of output. Can be "logistic" or "logitraw"
#' The explanation is in https://www.r-bloggers.com/with-our-powers-combined-xgboost-and-pipelearner/
#' @export
xgboost_binary <- function(data, formula, output_type = "logistic", eval_metric = "auc", params = list(), ...) {
# there can be more than 2 eval_metric
# by creating eval_metric parameters in params list
# but specify only one so that it is clear what is used for early stopping.
metric_list <- list(eval_metric = eval_metric)
params <- append(metric_list, params)
vars <- all.vars(formula)
y_name <- vars[[1]]
y_vals <- data[[y_name]]
# this is used to get back original values from predicted output
label_levels <- c(0, 1)
if(is.logical(y_vals)) {
label_levels <- c(FALSE, TRUE)
y_vals <- as.numeric(y_vals)
} else if (!all(y_vals[!is.na(y_vals)] %in% c(0, 1))){
# there are values that are not 0 or 1, so should be mapped as factor
factored <- as.factor(data[[y_name]])
label_levels <- to_same_type(levels(factored), data[[y_name]])
if(length(label_levels) != 2){
stop("target variable must have 2 unique values")
}
y_vals <- as.numeric(factored) - 1
}
data[[y_name]] <- y_vals
objective <- paste0("binary:", output_type, sep = "")
ret <- tryCatch({
fml_xgboost(data = data,
formula = formula,
objective = objective,
params = params,
...)
}, error = function(e){
if(stringr::str_detect(e$message, "Check failed: !auc_error AUC: the dataset only contains pos or neg samples")){
stop("The target only contains positive or negative values")
}
stop(e)
})
# add class to control S3 methods
class(ret) <- c("xgboost_binary", class(ret))
ret$y_levels <- label_levels
original_colnames <- colnames(data)
# this attribute will be used to get back original column names
terms_mapping <- original_colnames
names(terms_mapping) <- original_colnames # No column name change. Just to work with the rest of the code.
ret$terms_mapping <- terms_mapping
ret
}
#' formula version of xgboost (multinomial)
#' @param output_type Type of output. Can be "softprob" or "softmax"
#' The explanation is in https://www.r-bloggers.com/with-our-powers-combined-xgboost-and-pipelearner/
#' @export
xgboost_multi <- function(data, formula, output_type = "softprob", eval_metric = "merror", params = list(), ...) {
# there can be more than 2 eval_metric
# by creating eval_metric parameters in params list
# but specify only one so that it is clear what is used for early stopping.
metric_list <- list(eval_metric = eval_metric)
params <- append(metric_list, params)
vars <- all.vars(formula)
y_name <- vars[[1]]
y_vals <- data[[y_name]]
# this is used to get back original values from predicted output
label_levels <- if(is.logical(y_vals)) {
c(FALSE, TRUE)
} else if (is.factor(y_vals)) {
# this is sorted unique factor
# will be used to re-construct factor from index vector with the same level
sort(unique(data[[y_name]]))
} else {
factored <- as.factor(data[[y_name]])
to_same_type(levels(factored), data[[y_name]])
}
if(is.logical(y_vals)) {
y_vals <- as.numeric(y_vals)
} else if (is.factor(y_vals)) {
# Map the levels to integers from 0 to (number of levels - 1)
mapping <- 0:(length(label_levels)-1)
names(mapping) <- as.character(label_levels)
y_vals <- mapping[as.character(y_vals)]
} else {
factored <- as.factor(data[[y_name]])
y_vals <- as.numeric(factored) - 1
}
data[[y_name]] <- y_vals
objective <- paste0("multi:", output_type, sep = "")
ret <- fml_xgboost(data = data,
formula = formula,
objective = objective,
num_class = length(label_levels),
params = params,
...)
# add class to control S3 methods
class(ret) <- c("xgboost_multi", class(ret))
ret$y_levels <- label_levels
original_colnames <- colnames(data)
# this attribute will be used to get back original column names
terms_mapping <- original_colnames
names(terms_mapping) <- original_colnames # No column name change. Just to work with the rest of the code.
ret$terms_mapping <- terms_mapping
ret
}
#' formula version of xgboost (regression)
#' @param output_type Type of output. Can be "linear", "logistic", "gamma" or "tweedie"
#' The explanation is in https://www.r-bloggers.com/with-our-powers-combined-xgboost-and-pipelearner/
#' @export
xgboost_reg <- function(data, formula, output_type = "linear", eval_metric = NULL, params = list(), tweedie_variance_power = 1.5, ...) {
# There can be more than 2 eval_metric
# by creating eval_metric parameters in params list,
# but specify only one so that it is clear what is used for early stopping.
if (is.null(eval_metric)) {
if(output_type == "gamma"){
eval_metric <- "gamma-nloglik"
}
else if(output_type == "tweedie"){
tvp <- if(!is.null(params$tweedie_variance_power)){
params$tweedie_variance_power
} else {
params$tweedie_variance_power <- tweedie_variance_power
tweedie_variance_power
}
eval_metric <- paste0("tweedie-nloglik@", tvp)
}
else {
eval_metric <- "rmse"
}
}
metric_list <- list(eval_metric = eval_metric)
params <- append(metric_list, params)
vars <- all.vars(formula)
y_name <- vars[1]
data[[y_name]] <- as.numeric(data[[y_name]])
objective <- paste0("reg:", output_type, sep = "")
ret <- fml_xgboost(data = data,
formula = formula,
objective = objective,
params = params,
...)
# add class to control S3 methods
class(ret) <- c("xgboost_reg", class(ret))
original_colnames <- colnames(data)
# this attribute will be used to get back original column names
terms_mapping <- original_colnames
names(terms_mapping) <- original_colnames # No column name change. Just to work with the rest of the code.
ret$terms_mapping <- terms_mapping
ret
}
#' Augment predicted values
#' @param x xgb.Booster model
#' @param data Data frame used to train xgb.Booster
#' @param newdata New data frame to predict
#' @param ... Not used for now.
#' @export
augment.xgboost_multi <- function(x, data = NULL, newdata = NULL, data_type = "training", ...) {
loadNamespace("xgboost") # This is necessary for predict() to successfully figure out which function to call internally.
predictor_variables <- all.vars(x$terms)[-1]
predictor_variables_orig <- x$terms_mapping[predictor_variables]
if(!is.null(newdata)) { # Unlike ranger case, there is no prediction result kept in the model in case of xgboost.
# Replay the mutations on predictors.
if(!is.null(x$predictor_funs)) {
newdata <- newdata %>% mutate_predictors(x$orig_predictor_cols, x$predictor_funs)
}
class(x) <- class(x)[class(x) != c("xgboost_multi")]
# create clean name data frame because the model learned by those names
original_data <- if(!is.null(newdata)){
newdata
} else {
data
}
cleaned_data <- original_data %>% dplyr::select(predictor_variables_orig)
# Rename columns to the normalized ones used while learning.
colnames(cleaned_data) <- predictor_variables
# Align factor levels including Others and (Missing) to the model. TODO: factor level order can be different from the model training data. Is this ok?
if (!is.null(x$df)) { # Model on Analytics Step does not have x$df.
cleaned_data <- align_predictor_factor_levels(cleaned_data, x$df, predictor_variables)
}
# For new data prediction, xgboost can predict with NAs in the predictors. Disable NA filtering for now.
# na_row_numbers <- ranger.find_na(predictor_variables, cleaned_data)
na_row_numbers <- c()
if (length(na_row_numbers) > 0) {
cleaned_data <- cleaned_data[-na_row_numbers,]
}
if (nrow(cleaned_data) == 0) {
return(data.frame())
}
# Run prediction.
predicted <- predict_xgboost(x, cleaned_data)
obj <- x$params$objective
if (obj == "multi:softmax") {
predicted_label_col <- avoid_conflict(colnames(original_data), "predicted_label")
# fill rows with NA
original_data[[predicted_label_col]] <- predicted
} else if (obj == "multi:softprob") {
predicted_prob_col <- avoid_conflict(colnames(original_data), "predicted_probability")
colmax <- max.col(predicted)
# get max probabilities from each row
max_prob <- predicted[(colmax - 1) * nrow(predicted) + seq(nrow(predicted))]
max_prob <- restore_na(max_prob, na_row_numbers)
predicted_label <- x$y_levels[colmax]
predicted_label <- restore_na(predicted_label, na_row_numbers)
original_data <- ranger.set_multi_predicted_values(original_data, as.data.frame(predicted), predicted_label, max_prob, na_row_numbers)
}
original_data
} else if (!is.null(data)) { # For Analytics View.
if (nrow(data) == 0) { #TODO: better place to do this check?
return(data.frame())
}
data <- data %>% dplyr::relocate(!!rlang::sym(x$orig_target_col), .after = last_col()) # Bring the target column to the last so that it is next to the predicted value in the output.
predicted_prob_col <- avoid_conflict(colnames(data), "predicted_probability")
switch(data_type,
training = {
# Inserting removed NA rows should not be necessary since we don't remove NA rows after test/training split.
predicted_value <- extract_predicted_multiclass_labels(x, type="training")
predicted <- extract_predicted(x, type="training")
# predicted_prob_col is a column for probabilities of chosen values
colmax <- max.col(predicted)
max_prob <- predicted[(colmax - 1) * nrow(predicted) + seq(nrow(predicted))]
data <- ranger.set_multi_predicted_values(data, as.data.frame(predicted), predicted_value, max_prob, c())
data
},
test = {
predicted_value_nona <- extract_predicted_multiclass_labels(x, type="test")
predicted_value_nona <- restore_na(predicted_value_nona, attr(x$prediction_test, "unknown_category_rows_index"))
predicted_value <- restore_na(predicted_value_nona, attr(x$prediction_test, "na.action"))
predicted <- extract_predicted(x, type="test")
# predicted_prob_col is a column for probabilities of chosen values
colmax <- max.col(predicted)
max_prob_nona <- predicted[(colmax - 1) * nrow(predicted) + seq(nrow(predicted))]
max_prob_nona <- restore_na(max_prob_nona, attr(x$prediction_test, "unknown_category_rows_index"))
max_prob <- restore_na(max_prob_nona, attr(x$prediction_test, "na.action"))
data <- ranger.set_multi_predicted_values(data, as.data.frame(predicted), predicted_value, max_prob, attr(x$prediction_test, "na.action"), attr(x$prediction_test, "unknown_category_rows_index"))
data
})
}
}
#' Augment predicted values for binary task
#' @param x xgb.Booster model
#' @param data Data frame used to train xgb.Booster
#' @param newdata New data frame to predict
#' @param ... Not used for now.
#' @export
augment.xgboost_binary <- function(x, data = NULL, newdata = NULL, data_type = "training", binary_classification_threshold = 0.5, ...) {
loadNamespace("xgboost") # This is necessary for predict() to successfully figure out which function to call internally.
predictor_variables <- all.vars(x$terms)[-1]
predictor_variables_orig <- x$terms_mapping[predictor_variables]
# create clean name data frame because the model learned by those names
if(!is.null(newdata)) { # Unlike ranger case, there is no prediction result kept in the model in case of xgboost.
# Replay the mutations on predictors.
if(!is.null(x$predictor_funs)) {
newdata <- newdata %>% mutate_predictors(x$orig_predictor_cols, x$predictor_funs)
}
original_data <- newdata
cleaned_data <- original_data %>% dplyr::select(predictor_variables_orig)
# Rename columns to the normalized ones used while learning.
colnames(cleaned_data) <- predictor_variables
# Align factor levels including Others and (Missing) to the model.
if (!is.null(x$df)) { # Model on Analytics Step does not have x$df.
cleaned_data <- align_predictor_factor_levels(cleaned_data, x$df, predictor_variables)
}
# For new data prediction, xgboost can predict with NAs in the predictors. Disable NA filtering for now.
# na_row_numbers <- ranger.find_na(predictor_variables, cleaned_data)
na_row_numbers <- c()
if (length(na_row_numbers) > 0) {
cleaned_data <- cleaned_data[-na_row_numbers,]
}
if (nrow(cleaned_data) == 0) {
return(data.frame())
}
# Run prediction.
predicted_val <- predict_xgboost(x, cleaned_data)
# Inserting once removed NA rows
predicted <- restore_na(predicted_val, na_row_numbers)
obj <- x$params$objective
predicted_val_col <- avoid_conflict(colnames(original_data), "predicted_label")
predicted_prob_col <- avoid_conflict(colnames(original_data), "predicted_probability")
prob <- if (obj == "binary:logistic") {
original_data[[predicted_prob_col]] <- predicted
original_data[[predicted_val_col]] <- predicted > binary_classification_threshold
predicted
} else if (obj == "binary:logitraw") {
# binary:logitraw returns logit values
prob <- boot::inv.logit(predicted)
original_data[[predicted_prob_col]] <- prob
original_data[[predicted_val_col]] <- predicted
prob
} else {
stop(paste0("object type ", obj, " is not supported"))
}
original_data
} else if (!is.null(data)) { # For Analytics View.
if (nrow(data) == 0) { #TODO: better place to do this check?
return(data.frame())
}
data <- data %>% dplyr::relocate(!!rlang::sym(x$orig_target_col), .after = last_col()) # Bring the target column to the last so that it is next to the predicted value in the output.
predicted_value_col <- avoid_conflict(colnames(data), "predicted_label")
predicted_probability_col <- avoid_conflict(colnames(data), "predicted_probability")
switch(data_type,
training = {
# Inserting removed NA rows should not be necessary since we don't remove NA rows after test/training split.
predicted_prob <- extract_predicted(x)
predicted_value <- extract_predicted_binary_labels(x, threshold = binary_classification_threshold)
},
test = {
predicted_prob_nona <- extract_predicted(x, type="test")
predicted_value_nona <- extract_predicted_binary_labels(x, type="test", threshold = binary_classification_threshold)
predicted_prob_nona <- restore_na(predicted_prob_nona, attr(x$prediction_test, "unknown_category_rows_index"))
predicted_value_nona <- restore_na(predicted_value_nona, attr(x$prediction_test, "unknown_category_rows_index"))
predicted_prob <- restore_na(predicted_prob_nona, attr(x$prediction_test, "na.action"))
predicted_value <- restore_na(predicted_value_nona, attr(x$prediction_test, "na.action"))
})
data[[predicted_value_col]] <- predicted_value
data[[predicted_probability_col]] <- predicted_prob
data
}
}
#' Augment predicted values
#' @param x xgb.Booster model
#' @param data Data frame used to train xgb.Booster
#' @param newdata New data frame to predict
#' @param ... Not used for now.
#' @export
augment.xgboost_reg <- function(x, data = NULL, newdata = NULL, data_type = "training", ...) {
loadNamespace("xgboost") # This is necessary for predict() to successfully figure out which function to call internally.
predicted_value_col <- avoid_conflict(colnames(newdata), "predicted_value")
predictor_variables <- all.vars(x$terms)[-1]
predictor_variables_orig <- x$terms_mapping[predictor_variables]
if(!is.null(newdata)) { # Unlike ranger case, there is no prediction result kept in the model in case of xgboost.
# Replay the mutations on predictors.
if(!is.null(x$predictor_funs)) {
newdata <- newdata %>% mutate_predictors(x$orig_predictor_cols, x$predictor_funs)
}
# create clean name data frame because the model learned by those names
original_data <- if(!is.null(newdata)){
newdata
} else {
data
}
cleaned_data <- original_data %>% dplyr::select(predictor_variables_orig)
# Rename columns to the normalized ones used while learning.
colnames(cleaned_data) <- predictor_variables
# Align factor levels including Others and (Missing) to the model.
if (!is.null(x$df)) { # Model on Analytics Step does not have x$df.
cleaned_data <- align_predictor_factor_levels(cleaned_data, x$df, predictor_variables)
}
# For new data prediction, xgboost can predict with NAs in the predictors. Disable NA filtering for now.
# na_row_numbers <- ranger.find_na(predictor_variables, cleaned_data)
na_row_numbers <- c()
if (length(na_row_numbers) > 0) {
cleaned_data <- cleaned_data[-na_row_numbers,]
}
if (nrow(cleaned_data) == 0) {
return(data.frame())
}
# Run prediction.
predicted_val <- predict_xgboost(x, cleaned_data)
# Inserting once removed NA rows
original_data[[predicted_value_col]] <- restore_na(predicted_val, na_row_numbers)
original_data
} else if (!is.null(data)) { # For Analytics View.
if (nrow(data) == 0) { #TODO: better place to do this check?
return(data.frame())
}
data <- data %>% dplyr::relocate(!!rlang::sym(x$orig_target_col), .after = last_col()) # Bring the target column to the last so that it is next to the predicted value in the output.
switch(data_type,
training = {
predicted_value_col <- avoid_conflict(colnames(data), "predicted_value")
# Inserting removed NA rows should not be necessary since we don't remove NA rows after test/training split.
predicted <- extract_predicted(x)
data[[predicted_value_col]] <- predicted
data
},
test = {
predicted_value_col <- avoid_conflict(colnames(data), "predicted_value")
# Inserting removed NA rows
predicted_nona <- extract_predicted(x, type="test")
predicted_nona <- restore_na(predicted_nona, attr(x$prediction_test, "unknown_category_rows_index"))
predicted <- restore_na(predicted_nona, attr(x$prediction_test, "na.action"))
data[[predicted_value_col]] <- predicted
data
})
}
}
#' @export
augment.xgboost_exp <- function(x, data = NULL, newdata = NULL, ...) {
if ("xgboost_reg" %in% class(x)) {
augment.xgboost_reg(x, data, newdata, ...)
}
else if ("xgboost_binary" %in% class(x)) {
augment.xgboost_binary(x, data, newdata, ...)
}
else if ("xgboost_multi" %in% class(x)) {
augment.xgboost_multi(x, data, newdata, ...)
}
}
#' Augment predicted values
#' @param x xgb.Booster model
#' @param data Data frame used to train xgb.Booster
#' @param newdata New data frame to predict
#' @param ... Not used for now.
#' @export
augment.xgb.Booster <- function(x, data = NULL, newdata = NULL, ...) {
loadNamespace("xgboost") # This is necessary for predict() to successfully figure out which function to call internally.
class(x) <- class(x)[class(x) != "xgboost_binary" &
class(x) != "xgboost_multi" &
class(x) != "xgboost_reg" &
class(x) != "fml_xgboost"]
if(!is.null(newdata)){
data <- newdata
}
mat_data <- if(!is.null(x$x_names)) {
data[x$x_names]
} else {
# use all data if there is no x_names
data
}
mat <- as.matrix(mat_data)
# predict of xgboost expects double matrix as input
if(is.integer(mat) || is.logical(mat)){
mat <- matrix(as.numeric(mat), ncol = ncol(mat))
}
predicted <- stats::predict(x, mat)
predicted_value_col <- avoid_conflict(colnames(data), "predicted_value")
data[[predicted_value_col]] <- predicted
data
}
#' Tidy method for xgboost output
#' @param x Fitted xgb.Booster model
#' @param type Can be "weight" or "log".
#' "weight" returns importances of features and "log" returns evaluation log.
#' @param ... Not used for now.
#' @export
tidy.xgb.Booster <- function(x, type="weight", pretty.name = FALSE, ...){
# xgboost_? class must be removed for xgboost::xgb.importance to work
class(x) <- class(x)[class(x)!="xgboost_binary" &
class(x)!="xgboost_multi" &
class(x)!="xgboost_reg" &
class(x) != "fml_xgboost"]
ret <- tryCatch({
ret <- xgboost::xgb.importance(feature_names = x$x_names,
model=x) %>% as.data.frame()
if (pretty.name) {
colnames(ret)[colnames(ret)=="Gain"] <- "Importance"
colnames(ret)[colnames(ret)=="Cover"] <- "Coverage"
colnames(ret)[colnames(ret)=="RealCover"] <- "Real Coverage"
colnames(ret)[colnames(ret)=="RealCover %"] <- "Real Coverage %"
} else {
colnames(ret)[colnames(ret)=="Feature"] <- "feature"
colnames(ret)[colnames(ret)=="Gain"] <- "importance"
colnames(ret)[colnames(ret)=="Cover"] <- "coverage"
colnames(ret)[colnames(ret)=="Frequency"] <- "frequency"
colnames(ret)[colnames(ret)=="Split"] <- "split"
colnames(ret)[colnames(ret)=="RealCover"] <- "real_coverage"
colnames(ret)[colnames(ret)=="RealCover %"] <- "real_coverage_pct"
colnames(ret)[colnames(ret)=="Weight"] <- "weight"
}
ret
}, error = function(e){
data.frame(Error = e$message)
})
ret
}
prettify_xgboost_evaluation_log <- function(df, pretty.name=FALSE) {
ret <- df
if(pretty.name){
colnames(ret)[colnames(ret) == "iter"] <- "Number of Iteration"
colnames(ret)[colnames(ret) == "train_rmse"] <- "RMSE"
colnames(ret)[colnames(ret) == "train_mae"] <- "Mean Absolute Error"
colnames(ret)[colnames(ret) == "train_logloss"] <- "Negative Log Likelihood"
# this can be train_error@{threshold}
with_train_error <- stringr::str_detect(colnames(ret), "^train_error")
colnames(ret)[with_train_error] <- stringr::str_replace(colnames(ret)[with_train_error], "^train_error", "Misclass. Rate")
colnames(ret)[colnames(ret) == "train_merror"] <- "Misclass. Rate" # this is for multiclass
colnames(ret)[colnames(ret) == "train_mlogloss"] <- "Multiclass Logloss"
colnames(ret)[colnames(ret) == "train_auc"] <- "AUC"
# this can be train_ndcg@{threshold}
with_train_ndcg <- stringr::str_detect(colnames(ret), "^train_ndcg")
colnames(ret)[with_train_ndcg] <- stringr::str_replace(colnames(ret)[with_train_ndcg], "^train_ndcg", "Normalized Discounted Cumulative Gain")
# this can be train_map@{threshold}
with_train_map <- stringr::str_detect(colnames(ret), "^train_map")
colnames(ret)[with_train_map] <- stringr::str_replace(colnames(ret)[with_train_map], "^train_map", "Mean Average Precision")
# this can be train_tweedie_nloglik@{rho}
with_train_tweedie_nloglik <- stringr::str_detect(colnames(ret), "^train_tweedie_nloglik")
colnames(ret)[with_train_tweedie_nloglik] <- stringr::str_replace(colnames(ret)[with_train_tweedie_nloglik], "^train_tweedie_nloglik", "Tweedie Negative Log Likelihood")
colnames(ret)[colnames(ret) == "train_gamma_nloglik"] <- "Gamma Negative Log Likelihood"
colnames(ret)[colnames(ret) == "train_gamma_deviance"] <- "Gamma Deviance"
colnames(ret)[colnames(ret) == "validation_rmse"] <- "Validation RMSE"
colnames(ret)[colnames(ret) == "validation_mae"] <- "Validation Mean Absolute Error"
colnames(ret)[colnames(ret) == "validation_logloss"] <- "Validation Negative Log Likelihood"
# this can be validation_error@{threshold}
with_validation_error <- stringr::str_detect(colnames(ret), "^validation_error")
colnames(ret)[with_validation_error] <- stringr::str_replace(colnames(ret)[with_validation_error], "^validation_error", "Validation Misclass. Rate")
colnames(ret)[colnames(ret) == "validation_merror"] <- "Validation Misclass. Rate" # this is for multiclass
colnames(ret)[colnames(ret) == "validation_mlogloss"] <- "Validation Multiclass Logloss"
colnames(ret)[colnames(ret) == "validation_auc"] <- "Validation AUC"
# this can be validation_ndcg@{threshold}
with_validation_ndcg <- stringr::str_detect(colnames(ret), "^validation_ndcg")
colnames(ret)[with_validation_ndcg] <- stringr::str_replace(colnames(ret)[with_validation_ndcg], "^validation_ndcg", "Validation Normalized Discounted Cumulative Gain")
# this can be validation_map@{threshold}
with_validation_map <- stringr::str_detect(colnames(ret), "^validation_map")
colnames(ret)[with_train_map] <- stringr::str_replace(colnames(ret)[with_train_map], "^train_map", "Validation Mean Average Precision")
# this can be validation_tweedie_nloglik@{rho}
with_validation_tweedie_nloglik <- stringr::str_detect(colnames(ret), "^validation_tweedie_nloglik")
colnames(ret)[with_validation_tweedie_nloglik] <- stringr::str_replace(colnames(ret)[with_validation_tweedie_nloglik], "^validation_tweedie_nloglik", "Validation Tweedie Negative Log Likelihood")
colnames(ret)[colnames(ret) == "validation_ndcg"] <- "Validation Normalized Discounted Cumulative Gain"
colnames(ret)[colnames(ret) == "validation_map"] <- "Validation Mean Average Precision"
colnames(ret)[colnames(ret) == "validation_gamma_nloglik"] <- "Validation Gamma Negative Log Likelihood"
colnames(ret)[colnames(ret) == "validation_gamma_deviance"] <- "Validation Gamma Deviance"
} else {
colnames(ret)[colnames(ret) == "iter"] <- "number_of_iteration"
colnames(ret)[colnames(ret) == "train_rmse"] <- "root_mean_square_error"
colnames(ret)[colnames(ret) == "train_mae"] <- "mean_absolute_error"
colnames(ret)[colnames(ret) == "train_logloss"] <- "negative_log_likelihood"
# this can be train_error@{threshold}
with_train_error <- stringr::str_detect(colnames(ret), "^train_error")
colnames(ret)[with_train_error] <- stringr::str_replace(colnames(ret)[with_train_error], "^train_error", "misclassification_rate")
colnames(ret)[colnames(ret) == "train_merror"] <- "misclassification_rate" # this is for multiclass
colnames(ret)[colnames(ret) == "train_mlogloss"] <- "multiclass_logloss"
colnames(ret)[colnames(ret) == "train_auc"] <- "auc"
# this can be train_ndcg@{threshold}
with_train_ndcg <- stringr::str_detect(colnames(ret), "^train_ndcg")
colnames(ret)[with_train_ndcg] <- stringr::str_replace(colnames(ret)[with_train_ndcg], "^train_ndcg", "normalized_discounted_cumulative_gain")
# this can be train_map@{threshold}
with_train_map <- stringr::str_detect(colnames(ret), "^train_map")
colnames(ret)[with_train_map] <- stringr::str_replace(colnames(ret)[with_train_map], "^train_map", "mean_average_precision")
# this can be train_tweedie_nloglik@{rho}
with_train_tweedie_nloglik <- stringr::str_detect(colnames(ret), "^train_tweedie_nloglik")
colnames(ret)[with_train_tweedie_nloglik] <- stringr::str_replace(colnames(ret)[with_train_tweedie_nloglik], "^train_tweedie_nloglik", "tweedie_negative_log_likelihood")
colnames(ret)[colnames(ret) == "train_gamma_nloglik"] <- "gamma_negative_log_likelihood"
colnames(ret)[colnames(ret) == "train_gamma_deviance"] <- "gamma_deviance"
colnames(ret)[colnames(ret) == "validation_rmse"] <- "validation_root_mean_square_error"
colnames(ret)[colnames(ret) == "validation_mae"] <- "validation_mean_absolute_error"
colnames(ret)[colnames(ret) == "validation_logloss"] <- "validation_negative_log_likelihood"
# this can be validation_error@{threshold}
with_validation_error <- stringr::str_detect(colnames(ret), "^validation_error")
colnames(ret)[with_validation_error] <- stringr::str_replace(colnames(ret)[with_validation_error], "^validation_error", "validation_misclassification_rate")
colnames(ret)[colnames(ret) == "validation_merror"] <- "validation_misclassification_rate" # this is for multiclass
colnames(ret)[colnames(ret) == "validation_mlogloss"] <- "validation_multiclass_logloss"
colnames(ret)[colnames(ret) == "validation_auc"] <- "validation_auc"
# this can be validation_ndcg@{threshold}
with_validation_ndcg <- stringr::str_detect(colnames(ret), "^validation_ndcg")
colnames(ret)[with_validation_ndcg] <- stringr::str_replace(colnames(ret)[with_validation_ndcg], "^validation_ndcg", "validation_normalized_discounted_cumulative_gain")
# this can be validation_map@{threshold}
with_validation_map <- stringr::str_detect(colnames(ret), "^validation_map")
colnames(ret)[with_train_map] <- stringr::str_replace(colnames(ret)[with_train_map], "^train_map", "validation_mean_average_precision")
# this can be validation_tweedie_nloglik@{rho}
with_validation_tweedie_nloglik <- stringr::str_detect(colnames(ret), "^validation_tweedie_nloglik")
colnames(ret)[with_validation_tweedie_nloglik] <- stringr::str_replace(colnames(ret)[with_validation_tweedie_nloglik], "^validation_tweedie_nloglik", "validation_tweedie_negative_log_likelihood")
colnames(ret)[colnames(ret) == "validation_gamma_nloglik"] <- "validation_gamma_negative_log_likelihood"
colnames(ret)[colnames(ret) == "validation_gamma_deviance"] <- "validation_gamma_deviance"
}
ret
}
#' Glance for xgb.Booster model
#' @param x xgb.Booster model
#' @param ... Not used for now.
#' @export
glance.xgb.Booster <- function(x, pretty.name = FALSE, ...) {
# data frame with
# number of iteration
# with chosen evaluation metrics
ret <- x$evaluation_log %>% as.data.frame()
ret <- prettify_xgboost_evaluation_log(ret, pretty.name=TRUE)
ret
}
# XGBoost prediction function that takes data frame rather than matrix.
predict_xgboost <- function(model, df) {
y_name <- all.vars(model$terms)[[1]]
if(is.null(df[[y_name]])){
# if there is no column in the formula (even if it's response variable),
# model.matrix function causes an error
# so create the column with 0
df[[y_name]] <- rep(0, nrow(df))
}
mat_data <- if(!is.null(model$is_sparse) && model$is_sparse){
Matrix::sparse.model.matrix(model$terms, data = model.frame(df, na.action = na.pass, xlev = model$xlevels))
} else {
model.matrix(model$terms, model.frame(df, na.action = na.pass, xlev = model$xlevels))
}
predicted <- stats::predict(model, mat_data)
obj <- model$params$objective
if (obj == "multi:softprob") {
# predicted is a vector containing probabilities for each class
probs <- matrix(predicted, nrow = length(model$y_levels)) %>% t()
# probs <- fill_mat_NA(as.numeric(rownames(mat_data)), probs, nrow(df)) # code from xgboost step.
probs <- fill_mat_NA(1:nrow(df), probs, nrow(df)) # Not sure if this is necessary. TODO: check.
colnames(probs) <- model$y_levels
predicted <- probs
# We return matrix in this case.
# Converting it to data.frame is the caller's responsibility.
}
else if (obj == "multi:softmax") {
predicted <- model$y_levels[predicted+1]
# fill rows with NA
predicted <- fill_vec_NA(as.integer(rownames(mat_data)), predicted, nrow(df))
}
else { # TODO: Here we assume all other cases returns vector prediction result.
# model.matrix removes rows with NA and stats::predict returns a matrix
# whose number of rows is the same with its size,
# so the result should be filled by NA
predicted <- fill_vec_NA(as.integer(rownames(mat_data)), predicted, nrow(df))
}
predicted
}
# Calculates permutation importance for regression by xgboost.
calc_permutation_importance_xgboost_regression <- function(fit, target, vars, data) {
var_list <- as.list(vars)
importances <- purrr::map(var_list, function(var) {
mmpf::permutationImportance(data, var, target, fit, nperm = 1, # By default, it creates 100 permuted data sets. We do just 1 for performance.
predict.fun = function(object,newdata){predict_xgboost(object, newdata)},
# For some reason, default loss.fun, which is mean((x - y)^2) returns NA, even with na.rm=TRUE. Rewrote it with sum() to avoid the issue.
loss.fun = function(x,y){sum((x - y)^2, na.rm = TRUE)/length(x)})
})
importances <- purrr::flatten_dbl(importances)
importances_df <- tibble::tibble(variable=vars, importance=pmax(importances, 0)) # Show 0 for negative importance, which can be caused by chance in case of permutation importance.
importances_df <- importances_df %>% dplyr::arrange(-importance)
importances_df
}
calc_permutation_importance_xgboost_binary <- function(fit, target, vars, data) {
var_list <- as.list(vars)
importances <- purrr::map(var_list, function(var) {
mmpf::permutationImportance(data, var, target, fit, nperm = 1, # By default, it creates 100 permuted data sets. We do just 1 for performance.
predict.fun = function(object,newdata){predict_xgboost(object, newdata)},
loss.fun = function(x,y){-sum(log(1- abs(x - y[[1]])), na.rm = TRUE)} # Negative-log-likelihood-based loss function.
# loss.fun = function(x,y){-auroc(x,y[[1]])} # AUC based. y is actually a single column data.frame rather than a vector. TODO: Fix it in permutationImportance() to make it a vector.
)
})
importances <- purrr::flatten_dbl(importances)
importances_df <- tibble(variable=vars, importance=pmax(importances, 0)) # Show 0 for negative importance, which can be caused by chance in case of permutation importance.
importances_df <- importances_df %>% dplyr::arrange(-importance)
importances_df
}
calc_permutation_importance_xgboost_multiclass <- function(fit, target, vars, data) {
var_list <- as.list(vars)
importances <- purrr::map(var_list, function(var) {
mmpf::permutationImportance(data, var, target, fit, nperm = 1, # By default, it creates 100 permuted data sets. We do just 1 for performance.
predict.fun = function(object,newdata){predict_xgboost(object, newdata)},
# loss.fun = function(x,y){1-sum(colnames(x)[max.col(x)]==y[[1]], na.rm=TRUE)/length(y[[1]])} # misclassification rate
loss.fun = function(x,y){sum(-log(x[match(y[[1]][row(x)], colnames(x))==col(x)]), na.rm = TRUE)} # Negative log likelihood. https://ljvmiranda921.github.io/notebook/2017/08/13/softmax-and-the-negative-log-likelihood/
)
})
importances <- purrr::flatten_dbl(importances)
importances_df <- tibble(variable=vars, importance=pmax(importances, 0)) # Show 0 for negative importance, which can be caused by chance in case of permutation importance.
importances_df <- importances_df %>% dplyr::arrange(-importance)
importances_df
}
# TODO: Make this function model-agnostic and consolidate. There are similar code for lm/glm, ranger, rpart, and xgboost.
# Builds partial dependence data.
partial_dependence.xgboost <- function(fit, vars = colnames(data),
n = c(min(nrow(unique(data[, vars, drop = FALSE])), 25L), nrow(data)),
classification = FALSE, interaction = FALSE, uniform = TRUE, data, ...) {
target = all.vars(fit$terms)[[1]]
predict.fun = function(object, newdata) {
if (!classification) {
predict_xgboost(object, newdata)
} else {
# Returned prediction probability matrix works here as is.
predict_xgboost(object, newdata)
}
}
# Generate grid points based on quantile, so that FIRM calculated based on it would make good sense even when there are some outliers.
points <- list()
quantile_points <- list()
for (cname in vars) {
if (is.numeric(data[[cname]])) {
coldata <- data[[cname]]
minv <- min(coldata, na.rm=TRUE)
maxv <- max(coldata, na.rm=TRUE)
grid <- minv + (0:20)/20 * (maxv - minv)
quantile_grid <- quantile(coldata, probs=1:24/25)
quantile_points[[cname]] <- quantile_grid
points[[cname]] <- sort(unique(c(grid, quantile_grid)))
}
else {
points[[cname]] <- unique(data[[cname]])
}
}
args = list(
"data" = data,
"vars" = vars,
"n" = n,
"model" = fit,
"points" = points,
"predict.fun" = predict.fun,
...
)
if (length(vars) > 1L & !interaction) {
pd = rbindlist(sapply(vars, function(x) {
args$vars = x
if ("points" %in% names(args))
args$points = args$points[x]
mp = do.call(mmpf::marginalPrediction, args)
#if (fit$treetype == "Regression")
if (!classification)
names(mp)[ncol(mp)] = target
mp
}, simplify = FALSE), fill = TRUE)
data.table::setcolorder(pd, c(vars, colnames(pd)[!colnames(pd) %in% vars]))
} else {
pd = do.call(mmpf::marginalPrediction, args)
#if (fit$treetype == "Regression")
if (!classification) # TODO: If we give "regression" for binary classification, would it make sense here?
names(pd)[ncol(pd)] = target
}
attr(pd, "class") = c("pd", "data.frame")
attr(pd, "interaction") = interaction == TRUE
# Since levels(fit$y_levels) returns unused levels, it has to be as.character(fit$y_levels).
attr(pd, "target") = if (!classification) target else as.character(fit$y_levels)
attr(pd, "vars") = vars
attr(pd, "points") = points
attr(pd, "quantile_points") = quantile_points
pd
}
# This function should return following 2 columns.
# - variable - Name of variable
# - importance - Importance of the variable
# Rows should be sorted by importance in descending order.
importance_xgboost <- function(model) {
imp <- tidy.xgb.Booster(model)
if ("Error" %in% colnames(imp)) { # In case of Error, return error object to report this error without giving up on the entire model.
ret <- simpleError(imp$Error)
return(ret)
}
ret <- imp %>% dplyr::rename(variable=feature)
ret <- ret %>% dplyr::mutate(variable = stringr::str_extract(variable,'c\\d+_')) %>%
dplyr::group_by(variable) %>%
dplyr::summarize(importance = sum(importance, na.rm=TRUE)) #TODO: Does sum make sense to aggregate this importance?
ret <- ret %>% dplyr::arrange(-importance)
ret
}
# Model specific S3 functions.
extract_actual <- function(x, ...) {UseMethod("extract_actual", x)}
extract_predicted <- function(x, ...) {UseMethod("extract_predicted", x)}
extract_predicted_binary_labels <- function(x, ...) {UseMethod("extract_predicted_binary_labels", x)}
extract_predicted_multiclass_labels <- function(x, ...) {UseMethod("extract_predicted_multiclass_labels", x)}
get_prediction_type <- function(x, ...) {UseMethod("get_prediction_type", x)}
extract_actual.xgboost_exp <- function(x, type = "training") {
if (type == "training") {
actual <- x$df[[all.vars(x$terms)[[1]]]]
}
else {
actual <- x$df_test[[all.vars(x$terms)[[1]]]]
}
actual
}
extract_predicted.xgboost_exp <- function(x, type = "training") {
if (type == "training") {
predicted <- x$prediction_training
}
else {
predicted <- x$prediction_test
}
predicted
}
extract_predicted.xgboost_reg <- extract_predicted.xgboost_exp
extract_predicted.xgboost_binary <- extract_predicted.xgboost_exp
extract_predicted.xgboost_multi <- extract_predicted.xgboost_exp
extract_predicted_binary_labels.xgboost_exp <- function(x, threshold = 0.5, type = "training") {
if (type == "training") {
predicted <- x$prediction_training > threshold
}
else {
predicted <- x$prediction_test > threshold
}
predicted
}
extract_predicted_binary_labels.xgboost_binary <- extract_predicted_binary_labels.xgboost_exp
extract_predicted_multiclass_labels.xgboost_exp <- function(x, type = "training") {
if (type == "training") {
predicted <- x$y_levels[apply(x$prediction_training, 1, which.max)]
}
else {
predicted <- x$y_levels[apply(x$prediction_test, 1, which.max)]
}
predicted
}
extract_predicted_multiclass_labels.xgboost_multi <- extract_predicted_multiclass_labels.xgboost_exp
get_prediction_type.xgboost_exp <- function(x) {
if ("xgboost_reg" %in% class(x)) {
ret <- "regression"
}
if (x$classification_type == "binary") {
ret <- "binary"
}
else {
ret <- "multiclass"
}
ret
}
# Clean up data frame for test
# Removes NAs in predictors - TODO: Is this necessary given our preprocessing before this?
# Removes categorical values that do not appear in training data.
# Returns cleaned data frame.
# attr(ret, "na_row_numbers") - vector of row numbers of NA rows.
# attr(ret, "unknown_category_rows_index") - logical vector that is index of where the unknown category rows are after removing NA rows.
cleanup_df_for_test <- function(df_test, df_train, c_cols) {
na_row_numbers_test <- ranger.find_na(c_cols, data = df_test)
df_test_clean <- df_test
if (length(na_row_numbers_test) > 0) {
df_test_clean <- df_test_clean[-na_row_numbers_test,]
}
names(c_cols) <- NULL # This is necessary to make select in the following line work.
unknown_category_rows_index_vector <- get_unknown_category_rows_index_vector(df_test_clean, df_train %>% dplyr::select(!!!rlang::syms(c_cols)))
df_test_clean <- df_test_clean[!unknown_category_rows_index_vector, , drop = FALSE] # 2nd arg must be empty.
unknown_category_rows_index <- get_row_numbers_from_index_vector(unknown_category_rows_index_vector)
attr(df_test_clean, "na_row_numbers") <- na_row_numbers_test
attr(df_test_clean, "unknown_category_rows_index") <- unknown_category_rows_index
df_test_clean
}
#' Build XGBoost model for Analytics View.
#' @export
exp_xgboost <- function(df,
target,
...,
target_fun = NULL,
predictor_funs = NULL,
max_nrow = 50000, # Down from 200000 when we added partial dependence
# XGBoost-specific parameters
nrounds = 10,
watchlist_rate = 0,
sparse = FALSE,
booster = "gbtree",
early_stopping_rounds = NULL, # No early stop.
max_depth = 6,
min_child_weight = 1,
gamma = 1,
subsample = 1,
colsample_bytree = 1,
eta = 0.3, # We used to set learning_rate parameter here for Analytics Step. Corrected it while 6.2 development.
# Either xgboost package changed parameter name at some point,
# or we might have been wrong about the parameter name in the first place.
output_type_regression = "linear",
eval_metric_regression = "rmse",
output_type_binary = "logistic",
eval_metric_binary = "auc",
output_type_multiclass = "softprob",
eval_metric_multiclass = "merror",
# Model agnostic parameters
target_n = 20,
predictor_n = 12, # So that at least months can fit in it.
smote = FALSE,
smote_target_minority_perc = 40,
smote_max_synth_perc = 200,
smote_k = 5,
importance_measure = "permutation", # "permutation", "impurity", or "firm".
max_pd_vars = NULL,
# Number of most important variables to calculate partial dependences on.
# By default, when Boruta is on, all Confirmed/Tentative variables.
# 12 when Boruta is off.
pd_sample_size = 500,
pd_grid_resolution = 20,
pd_with_bin_means = FALSE, # Default is FALSE for backward compatibility on the server
# with_boruta = FALSE,
# boruta_max_runs = 20, # Maximal number of importance source runs.
# boruta_p_value = 0.05, # Boruta recommends using the default 0.01 for P-value, but we are using 0.05 for consistency with other functions of ours.
seed = 1,
test_rate = 0.0,
test_split_type = "random" # "random" or "ordered"
){
if(test_rate < 0 | 1 < test_rate){
stop("test_rate must be between 0 and 1")
} else if (test_rate == 1){
stop("test_rate must be less than 1")
}
# this seems to be the new way of NSE column selection evaluation
# ref: https://github.com/tidyverse/tidyr/blob/3b0f946d507f53afb86ea625149bbee3a00c83f6/R/spread.R
target_col <- tidyselect::vars_select(names(df), !! rlang::enquo(target))
# this evaluates select arguments like starts_with
orig_selected_cols <- tidyselect::vars_select(names(df), !!! rlang::quos(...))
target_funs <- NULL
if (!is.null(target_fun)) {
target_funs <- list(target_fun)
names(target_funs) <- target_col
df <- df %>% mutate_predictors(target_col, target_funs)
}
if (!is.null(predictor_funs)) {
df <- df %>% mutate_predictors(orig_selected_cols, predictor_funs)
selected_cols <- names(unlist(predictor_funs))
}
else {
selected_cols <- orig_selected_cols
}
grouped_cols <- grouped_by(df)
# Sort predictors so that the result of permutation importance is stable against change of column order.
selected_cols <- stringr::str_sort(selected_cols)
# Remember if the target column was originally numeric or logical before converting type.
is_target_numeric <- is.numeric(df[[target_col]])
is_target_logical <- is.logical(df[[target_col]])
orig_levels <- NULL
if (is.factor(df[[target_col]])) {
orig_levels <- levels(df[[target_col]])
}
else if (is.logical(df[[target_col]])) {
orig_levels <- c("TRUE","FALSE")
}
clean_ret <- cleanup_df(df, target_col, selected_cols, grouped_cols, target_n, predictor_n)
clean_df <- clean_ret$clean_df
name_map <- clean_ret$name_map
clean_target_col <- clean_ret$clean_target_col
clean_cols <- clean_ret$clean_cols
# if target is numeric, it is regression but
# if not, it is classification
classification_type <- get_classification_type(clean_df[[clean_target_col]])
each_func <- function(df) {
tryCatch({
if(!is.null(seed)){
set.seed(seed)
}
# If we are to do SMOTE, do not down sample here and let exp_balance handle it so that we do not sample out precious minority data.
unique_val <- unique(df[[clean_target_col]])
if (smote && length(unique_val[!is.na(unique_val)]) == 2) {
sample_size <- NULL
}
else {
sample_size <- max_nrow
}
# Capture the classes of the columns at this point before preprocess_regression_data_after_sample called inside cleanup_df_per_group,
# so that we know the original classes of columns before characters are turned into factors,
# so that we can sort the partial dependence data for display accordingly.
# preprocess_regression_data_after_sample can remove columns, but it should not cause problem that we have more columns in
# orig_predictor_classes than the partial dependence data.
# Also, preprocess_regression_data_after_sample has code to add columns extracted from Date/POSIXct, but with recent releases,
# that should not happen, since the extraction is already done by mutate_predictors.
orig_predictor_classes <- capture_df_column_classes(df, clean_cols)
# XGBoost can work with NAs in numeric predictors. TODO: verify it.
# Also, no need to convert logical to factor unlike ranger.
clean_df_ret <- cleanup_df_per_group(df, clean_target_col, sample_size, clean_cols, name_map, predictor_n, filter_numeric_na=FALSE, convert_logical=FALSE)
if (is.null(clean_df_ret)) {
return(NULL) # skip this group
}
df <- clean_df_ret$df
c_cols <- clean_df_ret$c_cols
if (length(c_cols) == 0) {
# Previous version of message - stop("The selected predictor variables are invalid since they have only one unique values.")
stop("Invalid Predictors: Only one unique value.") # Message is made short so that it fits well in the Summary table.
}
name_map <- clean_df_ret$name_map
# apply smote if this is binary classification
unique_val <- unique(df[[clean_target_col]])
if (smote && length(unique_val[!is.na(unique_val)]) == 2) {
df <- df %>% exp_balance(clean_target_col, target_size = max_nrow, target_minority_perc = smote_target_minority_perc, max_synth_perc = smote_max_synth_perc, k = smote_k)
df <- df %>% dplyr::select(-synthesized) # Remove synthesized column added by exp_balance(). TODO: Handle it better. We might want to show it in resulting data.
}
# split training and test data
source_data <- df
test_index <- sample_df_index(source_data, rate = test_rate, ordered = (test_split_type == "ordered"))
df <- safe_slice(source_data, test_index, remove = TRUE)
if (test_rate > 0) {
df_test <- safe_slice(source_data, test_index, remove = FALSE)
}
# Restore source_data column name to original column name
rev_name_map <- names(name_map)
names(rev_name_map) <- name_map
colnames(source_data) <- rev_name_map[colnames(source_data)]
# build formula for randomForest
rhs <- paste0("`", c_cols, "`", collapse = " + ")
fml <- as.formula(paste(clean_target_col, " ~ ", rhs))
if (is_target_logical) {
model <- xgboost_binary(df, fml,
nrounds = nrounds,
watchlist_rate = watchlist_rate,
sparse = sparse,
booster = booster,
early_stopping_rounds = early_stopping_rounds,
max_depth = max_depth,
min_child_weight = min_child_weight,
gamma = gamma,
subsample = subsample,
colsample_bytree = colsample_bytree,
eta = eta,
output_type = output_type_binary,
eval_metric = eval_metric_binary)
}
else if(is_target_numeric) {
model <- xgboost_reg(df, fml,
nrounds = nrounds,
watchlist_rate = watchlist_rate,
sparse = sparse,
booster = booster,
early_stopping_rounds = early_stopping_rounds,
max_depth = max_depth,
min_child_weight = min_child_weight,
gamma = gamma,
subsample = subsample,
colsample_bytree = colsample_bytree,
eta = eta,
output_type = output_type_regression,
eval_metric = eval_metric_regression)
}
else {
model <- xgboost_multi(df, fml,
nrounds = nrounds,
watchlist_rate = watchlist_rate,
sparse = sparse,
booster = booster,
early_stopping_rounds = early_stopping_rounds,
max_depth = max_depth,
min_child_weight = min_child_weight,
gamma = gamma,
subsample = subsample,
colsample_bytree = colsample_bytree,
eta = eta,
output_type = output_type_multiclass,
eval_metric = eval_metric_multiclass)
}
class(model) <- c("xgboost_exp", class(model))
model$prediction_training <- predict_xgboost(model, df)
if (test_rate > 0) {
df_test_clean <- cleanup_df_for_test(df_test, df, c_cols)
na_row_numbers_test <- attr(df_test_clean, "na_row_numbers")
unknown_category_rows_index <- attr(df_test_clean, "unknown_category_rows_index")
prediction_test <- predict_xgboost(model, df_test_clean)
attr(prediction_test, "na.action") <- na_row_numbers_test
attr(prediction_test, "unknown_category_rows_index") <- unknown_category_rows_index
model$prediction_test <- prediction_test
model$df_test <- df_test_clean
}
if (is.null(max_pd_vars)) {
max_pd_vars <- 20 # Number of most important variables to calculate partial dependences on. This used to be 12 but we decided it was a little too small.
}
# return partial dependence
if (length(c_cols) > 1) { # Calculate importance only when there are multiple variables.
if (importance_measure == "permutation") {
if (is_target_logical) {
imp_df <- calc_permutation_importance_xgboost_binary(model, clean_target_col, c_cols, df)
}
else if (is_target_numeric) {
imp_df <- calc_permutation_importance_xgboost_regression(model, clean_target_col, c_cols, df)
}
else {
imp_df <- calc_permutation_importance_xgboost_multiclass(model, clean_target_col, c_cols, df)
}
model$imp_df <- imp_df
}
else if (importance_measure == "impurity" || # "impurity". Use the importance from the xgboost package.
importance_measure == "xgboost") { # Because of an old bug in UI definition, it was specified as "xgboost" in pre-6.5 Desktop. Covering backward compatibility here. Remove it at appropriate timing.
imp_df <- importance_xgboost(model)
model$imp_df <- imp_df
}
if (importance_measure == "firm") {
imp_vars <- c_cols # Just use c_cols as is for imp_vars to calculate partial dependence first before calculating FIRM.
}
else if ("error" %in% class(imp_df)) {
imp_vars <- c_cols # Just use c_cols as is for imp_vars to calculate partial dependence anyway.
imp_vars <- imp_vars[1:min(length(imp_vars), max_pd_vars)] # just take max_pd_vars first variables
}
else {
imp_vars <- imp_df$variable
imp_vars <- imp_vars[1:min(length(imp_vars), max_pd_vars)] # take max_pd_vars most important variables
}
}
else {
error <- simpleError("Variable importance requires two or more variables.")
model$imp_df <- error
imp_vars <- c_cols # Just use c_cols as is for imp_vars to calculate partial dependence anyway.
}
imp_vars <- as.character(imp_vars) # for some reason imp_vars is converted to factor at this point. turn it back to character.
model$imp_vars <- imp_vars
# Second element of n argument needs to be less than or equal to sample size, to avoid error.
if (length(imp_vars) > 0) {
model$partial_dependence <- partial_dependence.xgboost(model, vars=imp_vars, data=df,
n=c(pd_grid_resolution, min(nrow(df), pd_sample_size)),
classification=!(is_target_numeric||is_target_logical)) # We treat binary classification as a regression to predict probability here.
}
else {
model$partial_dependence <- NULL
}
if (importance_measure == "firm") { # If importance measure is FIRM, we calculate them now, after PDP is calculated.
if (length(c_cols) > 1) { # Calculate importance only when there are multiple variables.
pdp_target_col <- attr(model$partial_dependence, "target")
imp_df <- importance_firm(model$partial_dependence, pdp_target_col, imp_vars)
model$imp_df <- imp_df
imp_vars <- imp_df$variable
}
else {
error <- simpleError("Variable importance requires two or more variables.")
model$imp_df <- error
imp_vars <- c_cols # Just use c_cols as is for imp_vars to calculate partial dependence anyway.
}
imp_vars <- imp_vars[1:min(length(imp_vars), max_pd_vars)] # take max_pd_vars most important variables
model$imp_vars <- imp_vars
# Shrink the partial dependence data keeping only the important variables.
model$partial_dependence <- shrink_partial_dependence_data(model$partial_dependence, imp_vars)
}
if (length(imp_vars) > 0) {
if (pd_with_bin_means && (is_target_logical || is_target_numeric)) {
# We calculate means of bins only for logical or numeric target to keep the visualization simple.
model$partial_binning <- calc_partial_binning_data(df, clean_target_col, imp_vars)
}
}
# these attributes are used in tidy of randomForest
model$classification_type <- classification_type
model$orig_levels <- orig_levels
model$terms_mapping <- names(name_map)
names(model$terms_mapping) <- name_map
# model$y <- model.response(df) TODO: what was this??
model$df <- df
model$formula_terms <- terms(fml)
# To avoid saving a huge environment when caching with RDS.
attr(model$formula_terms,".Environment")<-NULL
model$sampled_nrow <- clean_df_ret$sampled_nrow
model$orig_target_col <- target_col # Used for relocating columns as well as for applying function.
if (!is.null(target_funs)) {
model$target_funs <- target_funs
}
if (!is.null(predictor_funs)) {
model$orig_predictor_cols <- orig_selected_cols
attr(predictor_funs, "LC_TIME") <- Sys.getlocale("LC_TIME")
attr(predictor_funs, "sysname") <- Sys.info()[["sysname"]] # Save platform name (e.g. Windows) since locale name might need conversion for the platform this model will be run on.
attr(predictor_funs, "lubridate.week.start") <- getOption("lubridate.week.start")
model$predictor_funs <- predictor_funs
}
model$orig_predictor_classes <- orig_predictor_classes
list(model = model, test_index = test_index, source_data = source_data)
}, error = function(e){
if(length(grouped_cols) > 0) {
# In repeat-by case, we report group-specific error in the Summary table,
# so that analysis on other groups can go on.
class(e) <- c("xgboost_exp", class(e))
list(model = e, test_index = NULL, source_data = NULL)
} else {
stop(e)
}
})
}
model_and_data_col <- "model_and_data"
ret <- do_on_each_group(clean_df, each_func, name = model_and_data_col, with_unnest = FALSE)
# It is necessary to nest in order to retrieve the result stored in model_and_data_col.
# If there is a group column, omit the group column from nest, otherwise nest the whole
if (length(grouped_cols) > 0) {
ret <- ret %>% tidyr::nest(-grouped_cols)
} else {
ret <- ret %>% tidyr::nest()
}
ret <- ret %>% dplyr::ungroup() # Remove rowwise grouping so that following mutate works as expected.
# Retrieve model, test index and source data stored in model_and_data_col column (list) and store them in separate columns
ret <- ret %>% dplyr::mutate(model = purrr::map(data, function(df){
df[[model_and_data_col]][[1]]$model
})) %>%
dplyr::mutate(.test_index = purrr::map(data, function(df){
df[[model_and_data_col]][[1]]$test_index
})) %>%
dplyr::mutate(source.data = purrr::map(data, function(df){
data <- df[[model_and_data_col]][[1]]$source_data
if (length(grouped_cols) > 0 && !is.null(data)) {
data %>% dplyr::select(-grouped_cols)
} else {
data
}
})) %>%
dplyr::select(-data)
# Rowwise grouping has to be redone with original grouped_cols, so that summarize(tidy(model)) later can add back the group column.
if (length(grouped_cols) > 0) {
ret <- ret %>% dplyr::rowwise(grouped_cols)
} else {
ret <- ret %>% dplyr::rowwise()
}
# If all the groups are errors, it would be hard to handle resulting data frames
# at the chart preprocessors. Hence, we instead stop the processing here
# and just throw the error from the first group.
if (purrr::every(ret$model, function(x) {"error" %in% class(x)})) {
stop(ret$model[[1]])
}
ret
}
# This is used from Analytics View only when classification type is regression.
#' @export
glance.xgboost_exp <- function(x, pretty.name = FALSE, ...) {
if ("error" %in% class(x)) {
ret <- data.frame(Note = x$message)
return(ret)
}
if ("xgboost_reg" %in% class(x)) {
glance.ranger.method <- glance.xgboost_exp.regression
}
else {
stop("glance.xgboost_exp should not be called for classification")
}
ret <- glance.ranger.method(x, pretty.name = pretty.name, ...)
ret
}
#' @export
glance.xgboost_exp.regression <- function(x, pretty.name, ...) {
predicted <- extract_predicted(x)
actual <- extract_actual(x)
root_mean_square_error <- rmse(predicted, actual)
rsq <- r_squared(actual, predicted)
n <- length(actual)
ret <- data.frame(
# root_mean_square_error = sqrt(x$prediction.error),
# r_squared = x$r.squared
r_squared = rsq,
root_mean_square_error = root_mean_square_error,
n = n
)
if(pretty.name){
map = list(
`R Squared` = as.symbol("r_squared"),
`RMSE` = as.symbol("root_mean_square_error"),
`Rows` = as.symbol("n")
)
ret <- ret %>%
dplyr::rename(!!!map)
}
ret
}
#' @export
#' @param type "importance", "evaluation" or "conf_mat". Feature importance, evaluated scores or confusion matrix of training data.
tidy.xgboost_exp <- function(x, type = "importance", pretty.name = FALSE, binary_classification_threshold = 0.5, ...) {
if ("error" %in% class(x) && type != "evaluation") {
ret <- data.frame()
return(ret)
}
switch(
type,
importance = {
if ("error" %in% class(x$imp_df)) {
# Permutation importance is not supported for the family and link function, or skipped because there is only one variable.
# Return empty data.frame to avoid error.
ret <- data.frame()
return(ret)
}
ret <- x$imp_df
ret <- ret %>% dplyr::mutate(variable = x$terms_mapping[variable]) # map variable names to original.
ret
},
evaluation = {
# Delegate showing error for failed models to grance().
if ("error" %in% class(x)) {
return(glance(x, pretty.name = pretty.name, ...))
}
# get evaluation scores from training data
actual <- extract_actual(x)
if(is.numeric(actual)){
glance(x, pretty.name = pretty.name, ...)
} else {
if (x$classification_type == "binary") {
predicted <- extract_predicted_binary_labels(x, threshold = binary_classification_threshold)
predicted_probability <- extract_predicted(x)
ret <- evaluate_binary_classification(actual, predicted, predicted_probability, pretty.name = pretty.name)
}
else {
predicted <- extract_predicted_multiclass_labels(x)
ret <- evaluate_multi_(data.frame(predicted=predicted, actual=actual), "predicted", "actual", pretty.name = pretty.name)
}
ret
}
},
evaluation_by_class = { # We assume this is called only for multiclass classification.
# Delegate showing error for failed models to grance().
if ("error" %in% class(x)) {
return(glance(x, pretty.name = pretty.name, ...))
}
# get evaluation scores from training data
actual <- extract_actual(x)
predicted <- extract_predicted_multiclass_labels(x)
per_level <- function(level) {
ret <- evaluate_classification(actual, predicted, level, pretty.name = pretty.name)
ret
}
dplyr::bind_rows(lapply(levels(actual), per_level))
},
conf_mat = {
# return confusion matrix
actual <- extract_actual(x)
if (x$classification_type == "binary") {
predicted <- extract_predicted_binary_labels(x, threshold = binary_classification_threshold)
}
else {
predicted <- extract_predicted_multiclass_labels(x)
}
ret <- calc_conf_mat(actual, predicted)
ret
},
partial_dependence = {
ret <- handle_partial_dependence(x)
ret
},
evaluation_log = {
# data frame with
# number of iteration
# with chosen evaluation metrics
ret <- x$evaluation_log %>% as.data.frame()
ret <- ret %>% tidyr::pivot_longer(cols = c(-iter))
ret <- ret %>% tidyr::separate(col = "name", into=c("type","name"))
ret
},
{
stop(paste0("type ", type, " is not defined"))
})
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.