#' This function guesses sensible parameters for setting up the
#' xgb model. If the user has supplied parameters, the function
#' chooses the ones least likely to lead to overfitting.
#'
#' Objective functions and eval metrics are always taken
#' to be the user supplied values, if the suer has supplied them,
#' else they are guessed based on the type of the target variable.
#'
#' @param train_structure is the data structure produced by prepare_training_set
#' @param n_estimators minimum number of estimators
#' @param learning_rate maximum learning rate
#' @param depth minimum depth of trees
#' @param nrounds the number of rounds of training
#' @param objective_function the objective function to be used
#' @param eval_metric the evaluation metric
#' @export
guess_hyperparameters <- function(train_structure,
n_estimators = 10,
learning_rate = 0.05,
depth = 2,
nrounds = 100,
objective_function = NA,
eval_metric = NA){
hyperparameters <- list()
features <-Matrix::sparse.model.matrix(as.formula(paste(train_structure$target_variable, "~ .")),
data = train_structure$data)[,-1]
width <- ncol(features)
height <- nrow(features)
hyperparameters[["depth"]] <- max(depth, floor(sqrt(width)))
hyperparameters[["n_estimators"]] <- {floor(max(n_estimators,
exp(floor(log(height)))/
hyperparameters[["depth"]])) %>% log() %>% `-`(1) %>% exp() %>% floor()}
hyperparameters[["learning_rate"]] <- min(learning_rate,
1/(log(hyperparameters[["n_estimators"]]*
hyperparameters[["depth"]])
)
)
hyperparameters[['alpha']] <- log(hyperparameters[["n_estimators"]]) * hyperparameters[["depth"]] * hyperparameters[["learning_rate"]]
hyperparameters[['lambda']] <- log(hyperparameters[["n_estimators"]]) * hyperparameters[["depth"]] * hyperparameters[["learning_rate"]]
class_target <- ("target_reference" %in% names(train_structure))
if(class_target){
hyperparameters[['rf_probability']] <- TRUE
hyperparameters[["objective_function"]] <- "multi:softprob"
hyperparameters[["eval_metric"]] <- "mlogloss"
hyperparameters[["num_class"]] <- train_structure$data[[train_structure$target_variable]] %>% dplyr::n_distinct()
if (nrow(train_structure[["target_reference"]]) > 2){
hyperparameters[["glmnet_family"]] <- "multinomial"
} else {
hyperparameters[["glmnet_family"]] <- "binomial"
}
} else {
hyperparameters[['rf_probability']] <- FALSE
hyperparameters[["objective_function"]] <- "reg:linear"
hyperparameters[["eval_metric"]] <- "mae"
hyperparameters[["glmnet_family"]] <- "gaussian"
}
if(!is.na(objective_function)){
hyperparameters[["objective_function"]] <- objective_function
}
if(!is.na(eval_metric)){
hyperparameters[["eval_metric"]] <- eval_metric
}
hyperparameters[["nrounds"]] <- max(nrounds, floor(hyperparameters[["depth"]]*log(hyperparameters[["n_estimators"]])))
hyperparameters[['rf_trees']] <- floor(sqrt(hyperparameters[["depth"]]) * exp(floor(log(height)/2)))
hyperparameters[['rf_mtry']] <- min(floor(sqrt(ncol(train_structure$data) + floor(log(height)))), ncol(train_structure$data))
return(hyperparameters)
}
#' this function takes the train structure and hyperparameters, and an integer n
#' and does an n fold cross validation and returns the result.
#' @param train_structure the data structure generated by prepare_training_set
#' @param hyperparameters the list generated by guess_hyperparameters function
#' @param nfold number of folds of CV
#' @export
cross_validate_xgb <- function(train_structure, hyperparameters, nfold = 5){
# print("defining the xgb parameters")
xgb_params <- list("objective" = hyperparameters[["objective_function"]],
"eval_metric" = hyperparameters[["eval_metric"]],
"eta" = hyperparameters[["learning_rate"]],
"max_depth" = hyperparameters[["depth"]],
"n_estimators" = hyperparameters[["n_estimators"]],
"alpha" = hyperparameters[["alpha"]],
"lambda" = hyperparameters[["lambda"]])
if("num_class" %in% names(hyperparameters)){
xgb_params[["num_class"]] <- hyperparameters[["num_class"]]
}
# print("building the sparse model matrix")
features <-Matrix::sparse.model.matrix(as.formula(paste(train_structure$target_variable, "~ .")),
data = train_structure$data)[,-1]
# print("identifying the labels")
lab <- train_structure$data[[train_structure$target_variable]]
# print("building the xgb DMatrix for training")
dtrain <- xgboost::xgb.DMatrix(data = features, label = lab)
# print("running the cross validation")
cv_model <- xgboost::xgb.cv(params = xgb_params,
data = dtrain,
verbose = F,
nfold = nfold,
nrounds = hyperparameters[["nrounds"]],
prediction = T)
ret_struct <- list()
ret_struct[["cv_model"]] <- cv_model
# print("computing useful metrics")
if("num_class" %in% names(hyperparameters)){
OOF_prediction <- tibble::tibble(cv_model$pred) %>%
dplyr::mutate(max_prob = max.col(., ties.method = "last")) %>%
dplyr::mutate(label = lab+1)
cm <- caret::confusionMatrix(factor(OOF_prediction$max_prob),
factor(OOF_prediction$label),
mode = "everything")
ret_struct[["confusion_matrix"]] <- cm
ret_struct[["metric"]] <- cv_model$evaluation_log$test_mlogloss_mean[length(cv_model$evaluation_log$test_mlogloss_mean)]
} else {
ret_struct[["metric"]] <- cv_model$evaluation_log$test_mae_mean[length(cv_model$evaluation_log$test_mae_mean)]
}
return(ret_struct)
}
#' this function returns a structure that contains the xgb model as well as
#' everything else it needs (normalization factors, levels of variables etc)
#' a predict function would need
#' @param train_structure the data structure generated by prepare_training_set
#' @param hyperparameters the list generated by guess_hyperparameters function
#' @export
train_model_xgb <- function(train_structure, hyperparameters){
xgb_params <- list("objective" = hyperparameters[["objective_function"]],
"eval_metric" = hyperparameters[["eval_metric"]],
"eta" = hyperparameters[["learning_rate"]],
"max_depth" = hyperparameters[["depth"]],
"n_estimators" = hyperparameters[["n_estimators"]])
if("num_class" %in% names(hyperparameters)){
xgb_params[["num_class"]] <- hyperparameters[["num_class"]]
}
features <-Matrix::sparse.model.matrix(as.formula(paste(train_structure$target_variable, "~ .")),
data = train_structure$data)[,-1]
lab <- train_structure$data[[train_structure$target_variable]]
dtrain <- xgboost::xgb.DMatrix(data = features, label = lab)
xgbmodel <- xgboost::xgb.train(params = xgb_params,
data = dtrain,
verbose = F,
nrounds = hyperparameters[["nrounds"]],
prediction = T)
model_structure <- list()
model_structure[['models']] <- list()
model_structure[['models']][['model_xgb']] <- xgbmodel
if('num_class' %in% names(hyperparameters)){
model_structure[['target_reference']] <- train_structure[['target_reference']]
}
model_structure[['normalize_by']] <- train_structure[['normalize_by']]
model_structure[['levels']] <- train_structure[['levels']]
model_structure[['target_variable']] <- train_structure[['target_variable']]
return(model_structure)
}
#' this function returns a structure that contains the linear model as well as
#' everything already contained in the train_xgb return structure
#' @param train_structure the data structure generated by prepare_training_set
#' @param hyperparameters the list generated by guess_hyperparameters function
#' @param model_structure the model structure created by train xgb
#' @export
train_linear_model <- function(train_structure, model_structure, hyperparameters){
doParallel::registerDoParallel(8)
features <-Matrix::sparse.model.matrix(as.formula(paste(train_structure$target_variable, "~ .")),
data = train_structure$data)[,-1]
lab <- train_structure$data[[train_structure$target_variable]]
linear_model <- glmnet::cv.glmnet(features, lab, family = hyperparameters[["glmnet_family"]], parallel = TRUE)
model_structure[['models']][['linear_model']] <- linear_model
return(model_structure)
}
#' this function returns a structure that contains the rf model as well as
#' everything already contained in the train_xgb return structure
#' @param train_structure the data structure generated by prepare_training_set
#' @param hyperparameters the list generated by guess_hyperparameters function
#' @param model_structure the model structure created by train xgb
#' @export
train_rf_model <- function(train_structure, model_structure, hyperparameters){
rf_model <- ranger::ranger(dependent.variable.name=train_structure[["target_variable"]],
# formula = as.formula(paste(train_structure$target_variable, "~ .")),
data = train_structure$data,
num.trees = hyperparameters[['rf_trees']],mtry = hyperparameters[['rf_mtry']],
probability = hyperparameters[['rf_probability']],
save.memory = TRUE
)
model_structure[['models']][['rf_model']] <- rf_model
return(model_structure)
}
#' this functions deals with categorical variables in the test set
#' which are not present in the training set.
#' @param model_structure the model strucxture created by xgb
#' @param features is the output of xgboost::xgb.DMatrix for the test set after combining with the levels from training set
#' @export
handle_new_test_levels <- function(model_structure, features){
test_names <- colnames(features)
train_names <- model_structure[['models']][['model_xgb']][['feature_names']]
extra_test_names <- setdiff(test_names, train_names)
features <- features[,!colnames(features) %in% extra_test_names]
return(features)
}
#' this function takes the model structure generated by train_model_xgb, along with
#' a test set in the same format as the untransformed *input* df
#' to the prepare_training_set function, to return a prediction vector
#' in the untransformed df.
#' @param model_structure model structure cresated earlier
#' @param test_df the test data frame
#' @export
get_predictions_xgb <- function (model_structure, test_df)
{
levels_df <- model_structure[["levels"]]
test_cols <- colnames(test_df)
level_cols <- colnames(levels_df)
for (i in 1:length(level_cols)) {
if (!(level_cols[i] %in% test_cols))
levels_df[[level_cols[[i]]]] <- NULL
}
test_df[[model_structure[["target_variable"]]]] <- NULL
test_df <- rationalize_categoricals(test_df)
norm_test_df <- normalize_df(test_df, facs_df = model_structure[["normalize_by"]],
target_variable = model_structure[["target_variable"]])
norm_test_df <- rbind(levels_df,norm_test_df)
norm_test_df <- handle_missing_values(norm_test_df,
target_variable = model_structure[["target_variable"]],
train_facs = model_structure[["normalize_by"]])
norm_test_df[[model_structure[["target_variable"]]]] <- 0
features <- Matrix::sparse.model.matrix(stats::as.formula(paste(model_structure[["target_variable"]],"~ .")),
data = norm_test_df, row.names = F)[, -1]
features <- handle_new_test_levels(model_structure = model_structure, features = features)
# print(colnames(features))
dtest <- xgboost::xgb.DMatrix(data = features)
preds <- predict(model_structure[["models"]][['model_xgb']], dtest)
if (model_structure[["models"]][['model_xgb']][["params"]][["objective"]] ==
"multi:softprob") {
prob_matrix <- matrix(preds, nrow = nrow(norm_test_df),
byrow = T)
predictions <- tibble::as_tibble(prob_matrix) %>% tail(nrow(test_df))
if(length(as.character(model_structure[["target_reference"]][[1]]))==2){
predictions <- predictions %>%
mutate(V2 = 1-V1)
}
colnames(predictions) <- as.character(model_structure[["target_reference"]][[1]])
cat_df <- predictions %>% tibble::rownames_to_column("row_id") %>%
dplyr::mutate(row_id = as.numeric(row_id)) %>%
tidyr::gather(category, value, -row_id) %>%
dplyr::group_by(row_id) %>%
dplyr::slice(which.max(value)) %>%
dplyr::arrange(row_id)
predictions[["category"]] <- cat_df[["category"]]
}
else {
predictions <- tibble::tibble(prediction = preds[1:nrow(test_df)])
colnames(predictions) <- model_structure[["target_variable"]]
}
return(predictions)
}
#' this function takes the model structure generated by train_linear_model, along with
#' a test set in the same format as the untransformed *input* df
#' to the prepare_training_set function, to return a prediction vector
#' in the untransformed df.
#' @param model_structure model structure cresated earlier
#' @param test_df the test data frame
#' @export
get_predictions_linear <- function(model_structure, test_df){
levels_df <- model_structure[["levels"]]
test_cols <- colnames(test_df)
level_cols <- colnames(levels_df)
for (i in 1:length(level_cols)) {
if (!(level_cols[i] %in% test_cols))
levels_df[[level_cols[[i]]]] <- NULL
}
test_df[[model_structure[["target_variable"]]]] <- NULL
test_df <- rationalize_categoricals(test_df)
norm_test_df <- normalize_df(test_df, facs_df = model_structure[["normalize_by"]],
target_variable = model_structure[["target_variable"]])
norm_test_df <- rbind(levels_df,norm_test_df)
norm_test_df <- handle_missing_values(norm_test_df,
target_variable = model_structure[["target_variable"]],
train_facs = model_structure[["normalize_by"]])
norm_test_df[[model_structure[["target_variable"]]]] <- 0
features <- Matrix::sparse.model.matrix(stats::as.formula(paste(model_structure[["target_variable"]],"~ .")),
data = norm_test_df, row.names = F)[, -1]
features <- handle_new_test_levels(model_structure = model_structure, features = features)
if (model_structure[["models"]][['model_xgb']][["params"]][["objective"]] ==
"multi:softprob") {
preds <- plogis(stats::predict(model_structure[["models"]][['linear_model']],
newx = features))
prob_matrix <- matrix(preds, nrow = nrow(norm_test_df),
byrow = T)
predictions <- tibble::as_tibble(prob_matrix) %>% tail(nrow(test_df))
if(length(as.character(model_structure[["target_reference"]][[1]]))==2){
predictions <- predictions %>%
mutate(V2 = V1) %>%
mutate(V1 = 1-V2)
}
colnames(predictions) <- as.character(model_structure[["target_reference"]][[1]])
cat_df <- predictions %>% tibble::rownames_to_column(var = "row_id")
cat_df %>%
dplyr::mutate(row_id = as.numeric(row_id)) -> cat_df
cat_df %>%
tidyr::gather(category, value, -row_id) -> cat_df
cat_df %>%
dplyr::group_by(row_id) %>%
dplyr::slice(which.max(value)) %>%
dplyr::arrange(row_id) -> cat_df
predictions[["category"]] <- cat_df[["category"]]
}
else {
preds <- stats::predict(model_structure[["models"]][['linear_model']],
newx = features)
predictions <- tibble::tibble(prediction = preds[1:nrow(test_df)])
colnames(predictions) <- model_structure[["target_variable"]]
}
return(predictions)
}
#' this function takes the model structure generated by train_linear_model, along with
#' a test set in the same format as the untransformed *input* df
#' to the prepare_training_set function, to return a prediction vector
#' in the untransformed df.
#' @param model_structure model structure cresated earlier
#' @param test_df the test data frame
#' @export
get_predictions_rf <- function(model_structure, test_df){
levels_df <- model_structure[["levels"]]
test_cols <- colnames(test_df)
level_cols <- colnames(levels_df)
for (i in 1:length(level_cols)) {
if (!(level_cols[i] %in% test_cols))
levels_df[[level_cols[[i]]]] <- NULL
}
test_df[[model_structure[["target_variable"]]]] <- NULL
test_df <- rationalize_categoricals(test_df)
norm_test_df <- normalize_df(test_df, facs_df = model_structure[["normalize_by"]],
target_variable = model_structure[["target_variable"]])
norm_test_df <- rbind(levels_df,norm_test_df)
norm_test_df <- handle_missing_values(norm_test_df,
target_variable = model_structure[["target_variable"]],
train_facs = model_structure[["normalize_by"]])
norm_test_df[[model_structure[["target_variable"]]]] <- 0
if (model_structure[["models"]][['model_xgb']][["params"]][["objective"]] ==
"multi:softprob") {
preds <- predict(model_structure[["models"]][['rf_model']],
data = norm_test_df)
prob_matrix <- preds[['predictions']]
predictions <- tibble::as_tibble(prob_matrix) %>% tail(nrow(test_df))
class_list <- model_structure$models$rf_model$forest$class.values
column_names_predictions <- list()
for (counter_classes in 1:length(class_list)){
column_names_predictions[counter_classes] <- model_structure[["target_reference"]][[1]][class_list[counter_classes]+1]
}
colnames(predictions) <- column_names_predictions
cat_df <- predictions %>% tibble::rownames_to_column("row_id") %>%
dplyr::mutate(row_id = as.numeric(row_id)) %>%
tidyr::gather(category, value, -row_id) %>%
dplyr::group_by(row_id) %>%
dplyr::slice(which.max(value)) %>%
dplyr::arrange(row_id)
# print(cat_df)
predictions[["category"]] <- cat_df[["category"]]
} else {
preds <- predict(model_structure[["models"]][['rf_model']],
data = norm_test_df)
preds <- preds[['predictions']]
predictions <- tibble::tibble(prediction = preds[1:nrow(test_df)])
colnames(predictions) <- model_structure[["target_variable"]]
}
return(predictions)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.